// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.localedistance;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.ibm.icu.impl.locale.LocaleDistance.DISTANCE_SKIP_SCRIPT;
import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_LANG_DISTANCE;
import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_REGION_DISTANCE;
import static com.ibm.icu.impl.locale.LocaleDistance.IX_DEF_SCRIPT_DISTANCE;
import static com.ibm.icu.impl.locale.LocaleDistance.IX_LIMIT;
import static com.ibm.icu.impl.locale.LocaleDistance.IX_MIN_REGION_DISTANCE;
import static java.util.Arrays.asList;

import java.util.Arrays;
import java.util.Map;
import java.util.logging.Logger;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Table;
import com.google.common.collect.TreeBasedTable;

/**
 * Represents the conceptual distance between pairs of language specifications.
 *
 * <p>Mappings for {@code (desired, supported)} pairs are added at one of three
 * levels in the table; language, script and region. Distances can be resolved at
 * any level in the table (e.g. {@code ("en","fr")}, {@code ("en_Latn","ru_Cyrl")}
 * or {@code ("en_Latn_GB", "en_Latn_AU")}).
 *
 * <p>However in reality the "regions" in the table are actually "partition IDs"
 * representing groups of regions with the same language characteristics. For more
 * information on partitions and how they are generated, see {@link PartitionInfo}.
 *
 * <p>This is mentioned here because anyone debugging this code might be surprised
 * to see values like {@code "5"} for a "region" in the code. Using the term
 * "region" matches the conceptual level of the data and is more familiar to most
 * people, whereas "partition ID" would probably be jarring.
 *
 * <p>The builder class is not resuable, and once a table is built, the builder is
 * invalid. Furthermore, since the table data itself is mutable, care must be taken
 * to avoid modifying either the Trie or the returned distance array.
 *
 * <p>Note that internally the {@code '*'} character used as a wildcard for subtags
 * is replaced by the {@code '�'} character (a.k.a ANY), whenever a subtag is
 * passed into the API. This is because the underlying Trie structure generated by
 * the distance table reserves {@code '*'} for a different purpose. This difference
 * is encapsulated within this class and the {@link Trie} class only.
 */
final class DistanceTable {
    private static final Logger logger = Logger.getLogger(DistanceTable.class.getName());

    // Represents a wildcard match in the data table (the equivalent of '*' in
    // <languageMatch> locale subtag). Any incoming subtags are normalized to
    // convert '*' to this character by the builder.
    private static final String ANY = "�";

    // Distances must be in the range [0-127] because bit 7 of the distance value
    // is used for a special flag (DISTANCE_SKIP_SCRIPT). Setting the explicit max
    // to 100 is just a more human readable maximum that satisfies that constraint.
    private static final int MAX_REGION_DISTANCE = 100;

    static final class Builder {
        private final Node rootNode = new Node(-1);
        private int minRegionDistance = MAX_REGION_DISTANCE;

        private Builder() {}

        /**
         * Adds a distance to the table between the specified and desired tuples.
         * This method takes 1, 2 or 3 sequential {@code (desired, supported)} pairs
         * of values corresponding to language subtags, script subtags and regions
         * (partition IDs). All values can be the wildcard '*'.
         */
        public void addDistance(int distance, boolean oneway, String... args) {
            MappingKey key = MappingKey.fromSubtags(args, distance);
            logger.fine(key::toString);
            // Minimum region distance needs to be tracked specially.
            if (key.getDepth() == 3 && distance < minRegionDistance) {
                minRegionDistance = distance;
            }
            addMapping(key);
            if (!oneway && !key.isSymmetrical()) {
                addMapping(key.reverse());
            }
        }

        private void addMapping(MappingKey key) {
            rootNode.addExplicitMapping(key);
            if (key.hasWildcardMappings()) {
                rootNode.addWildcardMappings(key);
            }
        }

        /** Returns the final minimized distance table information. */
        public DistanceTable build() {
            Node defLangNode = rootNode.getAnyNode();
            checkState(defLangNode != null, "missing default language mapping: %s", rootNode);
            Node defScriptNode = defLangNode.getAnyNode();
            checkState(defScriptNode != null, "missing default script mapping: %s", rootNode);
            Node defRegionNode = defScriptNode.getAnyNode();
            checkState(defRegionNode != null, "missing default region mapping: %s", rootNode);

            // Because we prune the data table, it's important to store the default
            // distance values separately.
            int[] distances = new int[IX_LIMIT];
            distances[IX_DEF_LANG_DISTANCE] = defLangNode.distance;
            distances[IX_DEF_SCRIPT_DISTANCE] = defScriptNode.distance;
            distances[IX_DEF_REGION_DISTANCE] = defRegionNode.distance;
            distances[IX_MIN_REGION_DISTANCE] = minRegionDistance;

            // Having determined the distances, prune the Trie to remove any sub-tables
            // where distances could only be determined to be the default value (i.e.
            // where the existence of that sub-table has no effect).
            pruneDefaultDistances(defScriptNode.distance, defRegionNode.distance);
            return new DistanceTable(rootNode, distances);
        }

        @Override
        public String toString() {
            return String.format("minimum region distance: %d\n%s\n", minRegionDistance, rootNode);
        }

        private void pruneDefaultDistances(int defScriptDistance, int defRegionDistance) {
            logger.fine("==== pruning subtables ====");
            rootNode.subtables.values().forEach(langNode -> {
                langNode.subtables.values().forEach(scriptNode -> {
                    if (scriptNode.subtables.size() == 1) {
                        // If a script node *only* contains region data with the default
                        // region distance, that region data can be removed. Since region
                        // is the lowest level, there's no need to worry about "skipping"
                        // anything during lookup (unlike the case below).
                        Node defRegionNode = scriptNode.getAnyNode();
                        checkState(defRegionNode != null,
                                "missing default region node for script: %s", scriptNode);
                        if (defRegionNode.distance == defRegionDistance) {
                            scriptNode.subtables.clear();
                        }
                    }
                });
                // Do the pruning in the "upwards" phase of visitation (after recursion) so
                // if script subtables are pruned, it's visible here.
                if (langNode.subtables.size() == 1) {
                    // If a language node *only* contains script data with the default
                    // script distance, we can't just remove it (because it might contain
                    // region data).
                    Node defScriptNode = langNode.getAnyNode();
                    if (defScriptNode.distance == defScriptDistance) {
                        checkState(defScriptNode != null,
                                "missing default script node for language: %s", langNode);
                        if (defScriptNode.subtables.isEmpty()) {
                            // If the default script node has no region data, remove it.
                            langNode.subtables.clear();
                        } else {
                            // Otherwise mark script data as "skippable", which indicates
                            // it should be written in a compact form in the Trie (while
                            // retaining any region data as normal).
                            langNode.distance |= DISTANCE_SKIP_SCRIPT;
                        }
                    }
                }
            });
            // After pruning we don't expect any data in the top-level default table.
            checkState(rootNode.getAnyNode().subtables.isEmpty(),
                    "invalid table state: %s", rootNode.getAnyNode());
            rootNode.subtables.rowMap().remove(ANY);
        }
    }

    public static Builder builder() {
        return new Builder();
    }

    private final Node rootNode;
    private final int[] distances;

    private DistanceTable(Node rootNode, int[] distances) {
        this.rootNode = rootNode;
        this.distances = distances;
    }

    public Trie getTrie() {
        Trie trie = new Trie();
        rootNode.writeTo(trie.root());
        return trie;
    }

    public int[] getDefaultDistances() {
        return distances;
    }

    @Override
    public String toString() {
        return String.format("default distances: %s\n%s\n", Arrays.toString(distances), rootNode);
    }

    private static final class Node {
        private final Table<String, String, Node> subtables = TreeBasedTable.create();
        // Distance for the lookup so far (-1 for top level nodes).
        private int distance;

        Node(int distance) {
            checkArgument(distance >= -1, "invalid distance: %s", distance);
            this.distance = distance;
        }

        /** Returns the subtable node for the top-level mapping of a key. */
        private Node getNode(MappingKey key) {
            return subtables.get(key.getDesired(), key.getSupported());
        }

        /** Returns the subtable node for the {@code <ANY,ANY>} mapping. */
        Node getAnyNode() {
            return subtables.get(ANY, ANY);
        }

        void addExplicitMapping(MappingKey key) {
            if (key.isLeaf()) {
                if (!putIfAbsent(key)) {
                    logger.fine(() -> String.format("Ignore existing mapping: %s", key));
                }
            } else {
                getIntermediateNode(key).addExplicitMapping(key.getSuffix());
            }
        }

        void addWildcardMappings(MappingKey key) {
            if (key.isLeaf()) {
                putIfAbsent(key);
            } else if (key.isWildcard()) {
                // An intermediate wildcard mapping is applied to all existing sub-nodes.
                // NOTE: This will need to change if we want to support "mixed" wildcard mappings.
                for (Node node : subtables.values()) {
                    node.addWildcardMappings(key.getSuffix());
                }
            } else {
                // An explicit intermediate mapping only affects an existing exact match.
                Node node = getNode(key);
                if (node != null) {
                    node.addWildcardMappings(key.getSuffix());
                }
            }
        }

        /**
         * Adds a new mapping to this node with the specified distance if it didn't already
         * exist.
         *
         * <p>Note: If a mapping already exists, then this method has no effect (even if the
         * existing distance differs from the given distance). This is necessary to for two
         * reasons:
         * <ol>
         *     <li>An earlier match rule may have set an explicit value for the mapping,
         *         and we subsequently try to set a default value (via a wildcard mapping).
         *         This should be ignored, since we want the non-default value to win.
         *         This means it's important to always have explicit {@code <languageMatch>}
         *         rules before any related wildcard rules in the CLDR data.
         *
         *     <li>A preferential {@code <languageMatch>} rule appears earlier in CLDR data.
         *         This occurs because of the way partitions are defined and allows for two
         *         distinct {@code <languageMatch>} rules to generate the same mapping (with
         *         different distances). This is because region variables reference sets of
         *         partition IDs and these are not always disjoint (e.g. "en_*_$!enUS" and
         *         "en_*_GB" both contain the partition ID for "GB").
         * </ol>
         *
         * @return true if a new mapping was added, or if the distances were equal (i.e.
         *         the operation was idempotent).
         */
        private boolean putIfAbsent(MappingKey key) {
            Node node = getNode(key);
            if (node == null) {
                logger.fine(() -> String.format("add: %s", key));
                subtables.put(key.getDesired(), key.getSupported(), new Node(key.getDistance()));
                return true;
            }
            return (key.getDistance() == node.distance);
        }

        /**
         * Returns a sub-node corresponding to the given {@code (desired, supported)} mapping.
         * If the node already exists, it is simply returned, otherwise a new node is created
         * and any existing wildcard mappings are copied into it.
         */
        private Node getIntermediateNode(MappingKey key) {
            Node node = getNode(key);
            if (node == null) {
                // This is expected to succeed because match rules are given in length
                // order (i.e. language only before language+script etc.) and we always
                // expect each group to end with an <ANY,ANY> mapping for the default
                // distance. Thus, for any longer match rule, we should find (at least)
                // the <ANY,ANY> node when looking for intermediate nodes.
                //
                // NOTE: Currently (desired==ANY) if-and-only-if (supported=ANY), so the
                // only non-exact match we can get here is the <ANY,ANY> node. If we ever
                // allow a mix of wildcard/non-wildcard keys, replace the getAnyNode() call
                // with something like the line below:
                // ----
                // Node wildcardMatch = Iterables.find(
                //      asList(getNode(desired, ANY), getNode(ANY, supported), getNode(ANY,ANY)),
                //      Objects::nonNull);
                // ----
                Node wildcardMatch = getAnyNode();
                checkState(wildcardMatch != null, "missing <ANY,ANY> mapping: %s", this);
                // Default distances are the distance between any two *different* unknown
                // subtags (so if the subtags are the same, the distance is zero).
                int distance = key.getDesired().equals(key.getSupported()) ? 0 : wildcardMatch.distance;
                node = new Node(distance);
                node.copySubtablesFrom(wildcardMatch);
                subtables.put(key.getDesired(), key.getSupported(), node);
            }
            return node;
        }

        /** Copies all subtable mappings from the given node into this one. */
        private void copySubtablesFrom(Node src) {
            checkState(subtables.isEmpty());
            src.subtables.cellSet().forEach(
                    c -> subtables.put(c.getRowKey(), c.getColumnKey(), new Node(c.getValue().distance)));
        }

        /**
         * Writes all the mappings in the distance table sequentially to given Trie in sorted
         * table order.
         *
         * <p>Mappings are written in a top-down recursive visitation with sub-tables inheriting
         * the current prefix from parent tables via the given Trie span. At each level any
         * mapped distances are written before recursing into the sub-tables.
         */
        private void writeTo(Trie.Span trieSpan) {
            if (distance >= 0 && (distance & DISTANCE_SKIP_SCRIPT) != 0) {
                // If a node has a distance set and has been explicitly marked as "skippable",
                // then write the "default" subtable using the current Trie prefix (effectively
                // having an "empty" prefix for this case).
                getAnyNode().writeTo(trieSpan);
            } else {
                // In the normal case, just write the mappings explicitly.
                subtables.rowMap().forEach(
                        (desired, supportedNodes) -> writeSupported(trieSpan, desired, supportedNodes));
            }
        }

        private void writeSupported(Trie.Span trieSpan, String desired, Map<String, Node> supportedNodes) {
            // Collapse any (desired=ANY, supported=ANY) mappings into a single '*' in the trie.
            if (desired.equals(ANY)) {
                // If desired is ANY, the only supported subtag must also be ANY.
                Node node = supportedNodes.get(ANY);
                checkState(node != null && supportedNodes.size() == 1,
                        "invalid supported subtags for desired='ANY': %s", supportedNodes);
                // Remember that ANY != "*", even though it corresponds to "*" in the original
                // language match rules. Putting "*" in a Trie means something different (but
                // similar enough to be a bit confusing).
                trieSpan.with("*", node::writeDistancePlusSubtables);
            } else {
                // In the general case, just write the <desired,supported> distance mapping.
                trieSpan.with(desired, withDesiredSpan ->
                        supportedNodes.forEach((supported, node) -> {
                            checkState(!supported.equals(ANY),
                                    "unexpected supported='ANY' subtag: %s", supported);
                            withDesiredSpan.with(supported, node::writeDistancePlusSubtables);
                        })
                );
            }
        }

        // Writes the distance of this node to the given trie, then recursively writes any
        // subtable information.
        private void writeDistancePlusSubtables(Trie.Span trieSpan) {
            trieSpan.putPrefixAndValue(distance);
            writeTo(trieSpan);
        }

        @Override
        public String toString() {
            StringBuilder buffer = new StringBuilder("distance: ").append(distance).append('\n');
            return appendToString("", buffer).toString();
        }

        private StringBuilder appendToString(String indent, StringBuilder buffer) {
            // Top level values are not padded with tabs.
            String rowIndent = indent.isEmpty() ? "" : "\t";
            for (Map.Entry<String, Map<String, Node>> row : subtables.rowMap().entrySet()) {
                buffer.append(rowIndent).append(row.getKey());
                // First column extends the current row, so single tab indent.
                String colIndent = "\t";
                for (Map.Entry<String, Node> col : row.getValue().entrySet()) {
                    buffer.append(colIndent).append(col.getKey());
                    Node subnode = col.getValue();
                    buffer.append('\t').append(subnode.distance);
                    // Append any sub-nodes (starting on the same line).
                    subnode.appendToString(indent + "\t\t\t", buffer).append('\n');
                    // Later columns need full indent (including skipping row key).
                    colIndent = indent + '\t';
                }
                // Later rows need full indent.
                rowIndent = indent;
            }
            return buffer;
        }
    }

    /**
     * Excapsulates a sequence of {@code <desired,supported>} pairwise mappings over
     * language, script and region, with an associated distance. This is an alternate
     * way to represent a mapping of desired and supported language match rules.
     *
     * <p>For example:
     * <pre>{@code
     *   <languageMatch desired="en_*_$!enUS", supported="en_*_$GB", distance="3"/>
     * }</pre>
     * results in a set of keys of the form:
     * <pre>{@code
     *   <en,en> -> <ANY,ANY> -> <X,Y> = 3
     * }</pre>
     * where the "region" part {@code <X,Y>} is constructed from all the possible
     * combinations of partition IDs associated with the original region variables.
     *
     * <p>Mapping keys have several useful properties:
     * <ul>
     *     <li>They can be reversed (e.g. {@code <A,B> -> <C,D> = N} becomes
     *     {@code <B,A> -> <D,C> = N}).
     *     <li>They can be symmetrical (e.g. {@code <X,X> -> <Y,Y> = N}), in which
     *     case the reversed key is the same as the original.
     *     <li>They can have wildcard mappings (i.e. {@code <ANY,ANY>}).
     *     <li>They can produce "suffix" keys (e.g. the suffix of
     *     {@code <A,B> -> <C,D> = N} is {@code <C,D> = N}).
     * </ul>
     */
    private static final class MappingKey {
        /**
         * Returns a new key from the specified subtag pairs, converting {@code '*'}
         * subtags to the special {@code ANY} string and performing consistency checks.
         *
         * @param subtagPairs a sequence of {@code <desired,suported>} pairs.
         * @param distance the distance associated with the subtag mapping.
         */
        static MappingKey fromSubtags(String[] subtagPairs, int distance) {
            int pairCount = subtagPairs.length;
            checkArgument(pairCount == 2 || pairCount == 4 || pairCount == 6,
                    "invalid number of arguments (expected 1, 2 or 3 pairs): %s", asList(subtagPairs));
            ImmutableList.Builder<String> keyPairs = ImmutableList.builder();
            for (String subtag : subtagPairs) {
                keyPairs.add(fixAny(subtag));
            }
            return new MappingKey(keyPairs.build(), distance, false);
        }

        // Converts a '*' (from a subtag) into the wildcard match character used by the Trie.
        // The Trie uses '*' to mean something else, so we convert it at the boundary.
        private static String fixAny(String subtag) {
            return subtag.equals("*") ? ANY : subtag;
        }

        private final ImmutableList<String> pairs;
        private final int distance;
        private final boolean isReversed;
        private final boolean isSymmetrical;
        private final boolean hasWildcardMappings;

        private MappingKey(ImmutableList<String> pairs, int distance, boolean isReversed) {
            this.pairs = pairs;
            this.distance = distance;
            this.isReversed = isReversed;
            checkArgument(distance >= 0 && distance <= MAX_REGION_DISTANCE,
                    "invalid mapping key distance: %s", distance);
            // Check that if a key has "ANY" mappings, it is consistent. We expect to only
            // get <ANY,ANY> pairs (e.g. not <X,ANY> or <ANY,X>).
            boolean isSymmetrical = true;
            boolean hasWildcardMappings = false;
            for (int i = 0; i < pairs.size(); i += 2) {
                String desired = pairs.get(i);
                String supported = pairs.get(i + 1);
                checkArgument(desired.equals(ANY) == supported.equals(ANY),
                        "invalid mapping key pairs: %s", pairs);
                hasWildcardMappings |= desired.equals(ANY);
                isSymmetrical &= desired.equals(supported);
            }
            this.isSymmetrical = isSymmetrical;
            this.hasWildcardMappings = hasWildcardMappings;
        }

        /** Returns the "desired" value of the current (top-level) mapping. */
        String getDesired() {
            return pairs.get(isReversed ? 1 : 0);
        }

        /** Returns the "supported" value of the current (top-level) mapping. */
        String getSupported() {
            return pairs.get(isReversed ? 0 : 1);
        }

        /** Returns the non-negative distance mapped to by this key. */
        int getDistance() {
            return distance;
        }

        /**
         * Returns the number of {@code <desired,supported>} mappings in this key; this is
         * either 1 (language-only), 2 (language & script) or 3 (language, script & region).
         */
        int getDepth() {
            return pairs.size() / 2;
        }

        /** Returns true if this key does not have a suffix. */
        boolean isLeaf() {
            return getDepth() == 1;
        }

        /**
         * Returns if any of the {@code <desired,supported>} mappings are {@code <ANY,ANY>}.
         */
        boolean hasWildcardMappings() {
            return hasWildcardMappings;
        }

        /**
         * Returns if the top-level {@code <desired,supported>} mapping is {@code <ANY,ANY>}.
         */
        boolean isWildcard() {
            return getDesired().equals(ANY);
        }

        /**
         * Returns if this key is pair-wise symmetrical (e.g. {@code "<X,X> -> <Y,Y> = N"}).
         * Symmetrical mappings don't need to be added in reverse.
         */
        boolean isSymmetrical() {
            return isSymmetrical;
        }

        /** Returns a new key where each {@code <desired,supported>} mapping is reversed. */
        MappingKey reverse() {
            checkState(!isReversed, "cannot revese a reversed key");
            return new MappingKey(pairs, distance, true);
        }

        /**
         * Returns the suffix of this non-leaf key with the top-level mapping removed. For
         * example, the suffix of {@code "<A,B> -> <C,D> = N"} is {@code "<C,D> = N"}).
         */
        MappingKey getSuffix() {
            checkState(!isLeaf(), "cannot get 'next' for an empty key");
            return new MappingKey(pairs.subList(2, pairs.size()), distance, isReversed);
        }

        @Override
        public String toString() {
            return isLeaf()
                    ? String.format("<%s, %s> = %d", getDesired(), getSupported(), getDistance())
                    : String.format("<%s, %s> -> %s", getDesired(), getSupported(), getSuffix());
        }
    }
}
